Bring rectangular data in

## load gapminder
suppressPackageStartupMessages(library(gapminder))
## load tidyverse
suppressPackageStartupMessages(library(tidyverse))
## load forcats
suppressPackageStartupMessages(library(forcats))
## load plotly
suppressPackageStartupMessages(library(plotly))
## load scale
suppressPackageStartupMessages(library(scales))

Part 1: Factor management

Drop Oceania

Before dropping Oceania, let’s check the number of rows in gapminder, and the levels of continent.

# One-row summary of the full data set: its row count and the number of
# levels of `continent`.
gapminder %>%
  summarize(
    nrow = n(),
    nlevels = nlevels(continent)
  ) %>%
  knitr::kable(
    col.names = c("Number of rows", "Levels of `continent`")
  )
Number of rows Levels of continent
1704 5

Let’s first try to drop Oceania, and show the number of rows, and the levels of continent.

# Keep every row whose continent is not Oceania.
drop <- filter(gapminder, continent != "Oceania")

# Row count and number of `continent` levels after filtering.
drop %>%
  summarize(
    nrow = n(),
    nlevels = nlevels(continent)
  ) %>%
  knitr::kable(
    col.names = c(
      "Number of rows after dropping",
      "Levels of `continent` after dropping"
    )
  )
Number of rows after dropping Levels of continent after dropping
1680 5

Here we can see that even though we removed all rows associated with Oceania (so the number of rows decreases), the number of levels of continent is unchanged. That value is now wrong, because the unused level Oceania is still recorded. To fix it, we need to call droplevels().

# Remove factor levels that no longer occur in the filtered data.
drop_levels <- droplevels(drop)

# Row count and number of `continent` levels once unused levels are gone.
drop_levels %>%
  summarize(
    nrow = n(),
    nlevels = nlevels(continent)
  ) %>%
  knitr::kable(
    col.names = c(
      "Number of rows after dropping levels",
      "Levels of `continent` after dropping levels"
    )
  )
Number of rows after dropping levels Levels of continent after dropping levels
1680 4

To summarize, we compare the number of rows and the number of levels of continent before and after removing Oceania.

# Build a 2 x 2 matrix of statistics: one column per stage (before/after
# removing Oceania) and one row per statistic. matrix() fills column by
# column, so the first two values form the "before" column.
summary <- matrix(
  c(
    nrow(gapminder), nlevels(gapminder$continent),
    nrow(drop_levels), nlevels(drop_levels$continent)
  ),
  nrow = 2,
  dimnames = list(
    c("Number of rows", "Levels"),
    c("before", "after")
  )
)

# display the table
knitr::kable(summary)
before after
Number of rows 1704 1680
Levels 5 4

Reorder the levels of country and continent

Let’s show a preview of our dropped version of gapminder before reordering.

# preview the first rows of the data
knitr::kable(head(drop_levels))
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453
Afghanistan Asia 1957 30.332 9240934 820.8530
Afghanistan Asia 1962 31.997 10267083 853.1007
Afghanistan Asia 1967 34.020 11537966 836.1971
Afghanistan Asia 1972 36.088 13079460 739.9811
Afghanistan Asia 1977 38.438 14880372 786.1134
knitr::kable(tail(drop_levels))
country continent year lifeExp pop gdpPercap
Zimbabwe Africa 1982 60.363 7636524 788.8550
Zimbabwe Africa 1987 62.351 9216418 706.1573
Zimbabwe Africa 1992 60.377 10704340 693.4208
Zimbabwe Africa 1997 46.809 11404948 792.4500
Zimbabwe Africa 2002 39.989 11926563 672.0386
Zimbabwe Africa 2007 43.487 12311143 469.7093

Looking at the bigger picture, this data frame is ordered alphabetically by country. Let’s try to reorder the levels of country by the maximum population over the years using fct_reorder(). Note that the levels are reordered in ascending order, so the first level is the country with the smallest maximum population over the years.

# Reorder the levels of `country` by each country's maximum `pop`, then list
# the first few levels of the result.
drop_levels %>%
  mutate(country = fct_reorder(country, pop, max)) %>%
  pull(country) %>%
  levels() %>%
  head() %>%
  knitr::kable(col.names = c("`country` after reordering"))
country after reordering
Sao Tome and Principe
Iceland
Djibouti
Equatorial Guinea
Bahrain
Comoros

In order to check our results, we manually calculate the maximum population and order it.

# Cross-check: maximum population per country, smallest first.
drop_levels %>%
  group_by(country) %>%
  summarize(max_pop = max(pop)) %>%
  arrange(max_pop) %>%
  head() %>%
  knitr::kable(
    col.names = c("country", "Maximum population over the years")
  )
country Maximum population over the years
Sao Tome and Principe 199579
Iceland 301931
Djibouti 496374
Equatorial Guinea 551201
Bahrain 708573
Comoros 710960

So our operation using fct_reorder is correct.

We can do the same thing on continent. For example, we reorder its levels by the average life expectancy.

# Reorder the levels of `continent` by mean `lifeExp`, then list the
# resulting levels.
drop_levels %>%
  mutate(continent = fct_reorder(continent, lifeExp, mean)) %>%
  pull(continent) %>%
  levels() %>%
  head() %>%
  knitr::kable(col.names = c("`continent` after reordering"))
continent after reordering
Africa
Asia
Americas
Europe

We also double-check the results.

# Cross-check: mean life expectancy per continent, smallest first.
drop_levels %>%
  group_by(continent) %>%
  summarize(mean_lifeExp = mean(lifeExp)) %>%
  arrange(mean_lifeExp) %>%
  head() %>%
  knitr::kable(
    col.names = c("continent", "Average life expectancy over the years")
  )
continent Average life expectancy over the years
Africa 48.86533
Asia 60.06490
Americas 64.65874
Europe 71.90369

Characterize the (derived) data

In the previous section, we created two examples using fct_reorder() and arrange(), and both gave identical results. Now we reuse the first example to see how these two functions affect the figures generated.

Let’s try the first example, using fct_reorder() only.

# fct_reorder() only: Americas rows, each carrying its country's maximum
# population; the y axis uses `country` with levels reordered by that maximum.
drop_levels %>%
  filter(continent == "Americas") %>%
  group_by(country) %>%
  mutate(max_pop = max(pop)) %>%
  ggplot(
    aes(
      x = max_pop,
      y = fct_reorder(country, pop, max),
      color = country
    )
  ) +
  geom_point() +
  # log10-scaled x axis
  scale_x_log10() +
  labs(
    x = "Maximum population",
    y = "country",
    title = "Maximum population of countries in Americas"
  ) +
  theme_bw()

Let’s do it with arrange() only.

# arrange() only: one row per Americas country with its maximum population,
# rows sorted by that maximum; the factor levels of `country` are untouched.
drop_levels %>%
  filter(continent == "Americas") %>%
  group_by(country) %>%
  summarize(max_pop = max(pop)) %>%
  arrange(max_pop) %>%
  ggplot(aes(x = max_pop, y = country, color = country)) +
  geom_point() +
  # log10-scaled x axis
  scale_x_log10() +
  labs(
    x = "Maximum population",
    y = "country",
    title = "Maximum population of countries in Americas"
  ) +
  theme_bw()

Now we use fct_reorder() with arrange().

# fct_reorder() together with arrange(): rows are sorted by the per-country
# maximum population and the y axis uses the reordered `country` levels.
drop_levels %>%
  filter(continent == "Americas") %>%
  group_by(country) %>%
  mutate(max_pop = max(pop)) %>%
  arrange(max_pop) %>%
  ggplot(
    aes(
      x = max_pop,
      y = fct_reorder(country, pop, max),
      color = country
    )
  ) +
  geom_point() +
  # log10-scaled x axis
  scale_x_log10() +
  labs(
    x = "Maximum population",
    y = "country",
    title = "Maximum population of countries in Americas"
  ) +
  theme_bw()

Finding(s): using fct_reorder() (with or without arrange()) changes the order of levels shown in figures, while using arrange() alone does not affect the order of levels.

Part 2: File I/O

In order to test whether mutated data can survive the round trip of being written to a file and then read back, we first use the first example in Part 1 to mutate gapminder.

# Reorder the levels of `country` by maximum population, then collapse to one
# row per country holding that maximum.
data <- gapminder %>%
  mutate(country = fct_reorder(country, pop, max)) %>%
  group_by(country) %>%
  summarize(max_pop = max(pop))

# show summary of the data
glimpse(data)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# preview the first rows
knitr::kable(head(data))
country max_pop
Sao Tome and Principe 199579
Iceland 301931
Djibouti 496374
Equatorial Guinea 551201
Bahrain 708573
Comoros 710960
knitr::kable(tail(data))
country max_pop
Pakistan 169270617
Brazil 190010647
Indonesia 223547000
United States 301139947
India 1110396331
China 1318683096

write_csv()/read_csv()

# write `data` to a CSV file in the working directory
write_csv(data, "data_csv.csv")
# read the same file back; no column types are supplied, so read_csv()
# picks them itself (see the column specification it prints)
data_csv <- read_csv("data_csv.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   max_pop = col_double()
## )
# column types and first values after the CSV round trip
glimpse(data_csv)
## Observations: 142
## Variables: 2
## $ country <chr> "Sao Tome and Principe", "Iceland", "Djibouti", "Equat...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# preview the first rows
knitr::kable(head(data_csv))
country max_pop
Sao Tome and Principe 199579
Iceland 301931
Djibouti 496374
Equatorial Guinea 551201
Bahrain 708573
Comoros 710960
knitr::kable(tail(data_csv))
country max_pop
Pakistan 169270617
Brazil 190010647
Indonesia 223547000
United States 301139947
India 1110396331
China 1318683096

Finding(s): a CSV file stores only text, so when it is read back R has to parse each column using default column types. Therefore, the class of country changes from <fct> to <chr>. However, the data values themselves remain unchanged.

saveRDS()/readRDS()

# Serialize `data` to an RDS file, then load it back into a new object.
saveRDS(data, file = "data_rds.rds")
data_rds <- readRDS(file = "data_rds.rds")

# column types and first values after the RDS round trip
glimpse(data_rds)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# preview the first rows
knitr::kable(head(data_rds))
country max_pop
Sao Tome and Principe 199579
Iceland 301931
Djibouti 496374
Equatorial Guinea 551201
Bahrain 708573
Comoros 710960
knitr::kable(tail(data_rds))
country max_pop
Pakistan 169270617
Brazil 190010647
Indonesia 223547000
United States 301139947
India 1110396331
China 1318683096

Finding(s): saveRDS() and readRDS() can keep both data and classes of each column.

dput()/dget()

# Write a text representation of `data` to a file, then re-create the object
# from that file.
dput(data, file = "data.txt")
data_get <- dget(file = "data.txt")

# column types and first values after the dput()/dget() round trip
glimpse(data_get)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# preview the first rows
knitr::kable(head(data_get))
country max_pop
Sao Tome and Principe 199579
Iceland 301931
Djibouti 496374
Equatorial Guinea 551201
Bahrain 708573
Comoros 710960
knitr::kable(tail(data_get))
country max_pop
Pakistan 169270617
Brazil 190010647
Indonesia 223547000
United States 301139947
India 1110396331
China 1318683096

Finding(s): dput() and dget() can also keep both data and classes of each column.

Part 3: Visualization design

Task 1

Remake at least one figure or create a new one, in light of something you learned in the recent class meetings about visualization design and color.

Here I want to remake a figure I used in the first assignment. It tries to show the trends of population over the years for each continent.

# Figure copied from assignment 1, deliberately left plain: population
# against year, one semi-transparent point per observation, coloured and
# facetted by continent, with a log10 y axis.
p_ori <- ggplot(gapminder, aes(x = year, y = pop, color = continent)) +
  geom_point(alpha = 0.1) +
  scale_y_log10() +
  facet_wrap(~ continent)
p_ori

We cannot get much information from this figure. Therefore, we tidy up the data and improve the presentation (e.g. color, size) to make the figure more informative.

# For every year/continent pair compute the minimum, mean and maximum
# population, then stack the three statistics into a key column
# (`pop_attribute`) and a value column (`pop_val`).
tidy_up <- gapminder %>%
  group_by(year, continent) %>%
  summarize(
    min_pop = min(pop),
    mean_pop = mean(pop),
    max_pop = max(pop)
  ) %>%
  gather(key = "pop_attribute", value = "pop_val", min_pop:max_pop)

# Line plot (with points) of each population statistic over the years, one
# panel per continent and one colour per statistic.
p_new <- ggplot(
  tidy_up,
  aes(
    x = year,
    y = pop_val,
    color = pop_attribute,
    group = pop_attribute
  )
) +
  geom_point() +
  geom_line() +
  facet_wrap(~ continent) +
  # log10 y axis; a handful of evenly spaced breaks on the x axis
  scale_y_log10() +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 4)) +
  # legend title
  scale_color_discrete("Population distribution") +
  labs(
    y = "Population distribution",
    title = "Population distribution of each continent over the years"
  ) +
  theme_bw() +
  theme(
    strip.background = element_rect(fill = "grey"),
    panel.background = element_rect(fill = "white")
  )
p_new

Difference(s): the color in the new figure is not for each continent, but for different distributions of each continent, which makes more sense. In addition, we try to tidy up data into different distribution attributes, so the trends over the years are clearer.

Let’s create a new plot. In this new plot, we show maximum GDP per capita of each country in continent Americas. In addition, to make the figure clearer, we only show countries with maximum GDP per capita larger than 13,000.

# Maximum GDP per capita of every country in the Americas, keeping only the
# countries whose maximum is at least 13,000.
tidy_up_gdp <- gapminder %>%
  filter(continent == "Americas") %>%
  group_by(country) %>%
  summarize(max_gdp = max(gdpPercap)) %>%
  filter(max_gdp >= 13000)

# Bubble chart of the countries kept in `tidy_up_gdp`: one point per country,
# positioned and sized by its maximum GDP per capita and filled using
# gapminder's country colour scheme.
p_new_gdp <- tidy_up_gdp %>%
  # country as x axis and max_gdp as y axis
  ggplot(aes(x = country, y = max_gdp)) +
  # pch = 21 is a fillable circle, so `fill` sets the colour of each point
  geom_point(aes(size = max_gdp, fill = country), pch = 21) +
  # change color based on gapminder country color scheme
  scale_fill_manual(values = country_colors) +
  # show the size legend as dollar amounts
  scale_size_continuous(
    labels = dollar_format()
  ) +
  # add labels; the title states the same 13,000 threshold that was used to
  # filter `tidy_up_gdp`
  labs(
    x = "Country",
    y = "Maximum GDP per Capita",
    title = "Maximum GDP per Capita over 13,000 for countries in Americas",
    size = "Maximum GDP per Capita",
    fill = "Country"
  ) +
  # add theme; small axis text keeps the country names readable
  theme_bw() +
  theme(
    axis.text = element_text(size = 6)
  )
p_new_gdp

Task 2

Make a new graph by converting this visual (or another, if you’d like) to a plotly graph.

plotly is an R package for creating interactive web-based graphs. In the following simple example, the differences of using plotly are:

  • There is a floating toolbar for users to interact with the figure, e.g. saving the figure or zooming.
  • When users hover on each data point, the details of that data point are shown in a floating window.
# convert the ggplot object `p_new_gdp` into an interactive plotly graph
ggplotly(p_new_gdp)

In addition, we can make 3D plot using plotly.

# 3D plot of the largest population found in each continent for each year.
gapminder %>%
  # group by year and continent
  group_by(year, continent) %>%
  # largest population within each year/continent pair
  summarize(
    max_pop = max(pop)
  ) %>%
  # group by continent
  group_by(continent) %>%
  # make a 3D plot
  plot_ly(
    x = ~year,
    y = ~continent,
    z = ~max_pop,
    type = "scatter3d",
    # `mode` is a "+"-joined list of flags and must not contain spaces
    mode = "lines+markers",
    color = ~continent
  ) %>%
  # axis titles of the 3D scene
  layout(
    scene = list(
      xaxis = list(title = "Year"),
      yaxis = list(title = "Continent"),
      zaxis = list(title = "Maximum Population")
    )
  )

Part 4: Writing figures to file

Arguments of ggsave()

Let’s play around with the arguments of ggsave(), for example width, height, resolution, and text scale.

Different widths and heights

# default width and height; no `plot` is given, so ggsave() saves the most
# recent figure
ggsave("./ggsave/1.png")
## Saving 7 x 5 in image
1.png

1.png

# explicit width and height (10 x 5) instead of the defaults
ggsave("./ggsave/2.png", width = 10, height = 5)
2.png

2.png

Different resolutions

Here we try to generate image of two resolutions, one as normal and one very low.

# normal resolution: 300 dots per inch
ggsave("./ggsave/3.png", dpi = 300)
## Saving 7 x 5 in image
3.png

3.png

# very low resolution: 10 dots per inch, for comparison with the 300 dpi file
ggsave("./ggsave/4.png", dpi = 10)
## Saving 7 x 5 in image
4.png

4.png

Various graphics devices

# vector format: save through the SVG device
ggsave("./ggsave/5.svg", device = "svg")
## Saving 7 x 5 in image
5.svg

5.svg

# raster format: save through the BMP device
ggsave("./ggsave/6.bmp", device = "bmp")
## Saving 7 x 5 in image
6.bmp

6.bmp

The difference is that vector-format images store shapes (e.g. polygons), which are redrawn whenever the image is scaled, so they always look sharp. Raster-format images store pixels, so the image becomes blurry when scaled up.

Explicit provision of the plot object p

Without explicit provision of the plot object p, ggsave() always saves the latest figure (in our case, p_new_gdp). Therefore, we need to provide the plot object if we want to save another figure, for example p_new.

# save p_new by passing it explicitly, instead of the most recent figure
ggsave("./ggsave/7.png", plot = p_new)
## Saving 7 x 5 in image
7.png

7.png

But I want to do more!

Make a deeper exploration of the forcats packages.

According to this reference, we try to explore the following functions of forcats packages:

fct_relevel()

We can use fct_relevel() to manually reorder levels. For example, we can move Canada to be the first data in this example (which in the figure is the bottom one).

# Maximum population of each country in the Americas, with Canada moved to
# the first factor level (the bottom of the y axis) and its label in bold.
drop_levels %>%
  # try to only show a continent
  filter(continent == "Americas") %>%
  # group by country
  group_by(country) %>%
  # calculate maximum population for each country
  summarize(
    max_pop = max(pop)
  ) %>%
  # arrange by max_pop
  arrange(max_pop) %>%
  # fct_relevel() moves "Canada" to the front of the `country` levels
  ggplot(aes(x = max_pop, y = fct_relevel(country, "Canada"), color = country)) +
  # make it a scatterplot
  geom_point() +
  # scale x axis by log10
  scale_x_log10() +
  # change axis labels
  xlab("Maximum population") +
  ylab("country") +
  # add title
  labs(title = "Maximum population of countries in Americas") +
  # highlight Canada: the named element replaces only that label. `labels`
  # must hold label entries only, so no extra `parse` element belongs here.
  scale_y_discrete(labels = c("Canada" = expression(bold(Canada)))) +
  # add theme
  theme_bw() +
  theme(
    strip.background = element_rect(fill = "grey"),
    panel.background = element_rect(fill = "white")
  )

fct_reorder2()

We can use fct_reorder2() to reorder by two variables. For example, in the following example, we reorder by population first then GDP per capita (though it is not obvious).

# GDP per capita against population for the countries of Oceania, one line
# per country; the colour legend uses `country` reordered by fct_reorder2().
# NOTE(review): fct_reorder2() is normally given a summary function of two
# vectors; confirm that `min` produces the intended ordering.
gapminder %>%
  filter(continent == "Oceania") %>%
  ggplot(
    aes(
      x = pop,
      y = gdpPercap,
      color = fct_reorder2(country, pop, gdpPercap, min)
    )
  ) +
  geom_line() +
  # log10-scaled x axis
  scale_x_log10() +
  labs(
    color = "Country",
    x = "Population",
    y = "GDP per capita",
    title = "GDP per capita vs population"
  ) +
  theme_bw()

fct_infreq() and fct_rev()

We use fct_infreq() to order levels by decreasing frequency (most frequent first) and fct_rev() to reverse that order, so that the bars end up in increasing order. They are often used in bar plots.

# Bar plot counting, for each country in the Americas, the observations with
# GDP per capita of at least 15,000; `country` levels are ordered by
# frequency and then reversed.
gapminder %>%
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(country = fct_rev(fct_infreq(country))) %>%
  ggplot(aes(x = country, fill = country)) +
  geom_bar() +
  labs(
    x = "Country",
    y = "Count",
    title = "Number of years with GDP per capita >= 15,000"
  ) +
  theme_bw()

fct_recode()

fct_recode() can be used to manually change the names of levels. For example, we change the name of “United States” in the above example.

# Same bar plot as before, with the level "United States" renamed to "USA"
# after the levels have been ordered by frequency and reversed.
gapminder %>%
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(
    country = fct_recode(
      fct_rev(fct_infreq(country)),
      "USA" = "United States"
    )
  ) %>%
  ggplot(aes(x = country, fill = country)) +
  geom_bar() +
  labs(
    x = "Country",
    y = "Count",
    title = "Number of years with GDP per capita >= 15,000"
  ) +
  theme_bw()

fct_collapse()

We can use fct_collapse() to manually combine levels into groups. For example, we combine “Canada” and “USA” into one group in the above example.

# Same bar plot again: after ordering by frequency, reversing and renaming
# "United States" to "USA", the levels "Canada" and "USA" are merged into the
# single level "North Americas".
gapminder %>%
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(
    country = fct_collapse(
      fct_recode(
        fct_rev(fct_infreq(country)),
        "USA" = "United States"
      ),
      "North Americas" = c("Canada", "USA")
    )
  ) %>%
  ggplot(aes(x = country, fill = country)) +
  geom_bar() +
  labs(
    x = "Country",
    y = "Count",
    title = "Number of years with GDP per capita >= 15,000"
  ) +
  theme_bw()

fct_lump()

fct_lump() tries to aggregate groups automatically.

# Same bar plot, but the `country` levels are ordered by frequency, reversed
# and then passed to fct_lump() with its default settings.
gapminder %>%
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(country = fct_lump(fct_rev(fct_infreq(country)))) %>%
  ggplot(aes(x = country, fill = country)) +
  geom_bar() +
  labs(
    x = "Country",
    y = "Count",
    title = "Number of years with GDP per capita >= 15,000"
  ) +
  theme_bw()

References